#Importing necessary Packages
import warnings
import itertools
import numpy as np
import pandas as pd
import statsmodels.api as sm
from collections import Counter
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib.pylab import rcParams
%matplotlib inline
#Loading the dataset at zip code granular
##https://www.zillow.com/research/data/
df = pd.read_csv('C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\Zip_zhvi_uc_sfr_tier_0.33_0.67_sm_sa_month.csv')
##display 5 records
df.head(5)
| RegionID | SizeRank | RegionName | RegionType | StateName | State | City | Metro | CountyName | 2000-01-31 | ... | 2024-03-31 | 2024-04-30 | 2024-05-31 | 2024-06-30 | 2024-07-31 | 2024-08-31 | 2024-09-30 | 2024-10-31 | 2024-11-30 | 2024-12-31 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 91982 | 1 | 77494 | zip | TX | TX | Katy | Houston-The Woodlands-Sugar Land, TX | Fort Bend County | 209183.857849 | ... | 490931.167974 | 493431.867652 | 494729.768844 | 495028.711253 | 494651.061479 | 495058.369196 | 495747.641095 | 496870.232365 | 497591.216589 | 498013.381149 |
| 1 | 61148 | 2 | 8701 | zip | NJ | NJ | Lakewood | New York-Newark-Jersey City, NY-NJ-PA | Ocean County | 154922.985332 | ... | 689106.574503 | 699661.801780 | 708559.971483 | 715094.582736 | 720443.771066 | 726000.038135 | 733103.210025 | 740083.000281 | 745160.491341 | 747964.767711 |
| 2 | 91940 | 3 | 77449 | zip | TX | TX | Katy | Houston-The Woodlands-Sugar Land, TX | Harris County | 103712.508410 | ... | 280754.700629 | 281717.848068 | 282255.119080 | 282218.313457 | 281986.353330 | 281807.105654 | 281620.894517 | 281206.739260 | 280422.308405 | 279697.082664 |
| 3 | 62080 | 4 | 11368 | zip | NY | NY | New York | New York-Newark-Jersey City, NY-NJ-PA | Queens County | 302696.952652 | ... | 885946.861384 | 895038.513007 | 901396.636146 | 901967.282407 | 900606.279548 | 899885.353997 | 902459.710773 | 904617.265178 | 905818.780045 | 905896.339783 |
| 4 | 91733 | 5 | 77084 | zip | TX | TX | Houston | Houston-The Woodlands-Sugar Land, TX | Harris County | 102666.820965 | ... | 274609.487817 | 275605.670852 | 276105.868652 | 275935.732220 | 275641.755834 | 275374.731211 | 275130.328740 | 274631.886651 | 274039.496837 | 273706.228700 |
5 rows × 309 columns
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)
Missing values in each column:
RegionID 0
SizeRank 0
RegionName 0
RegionType 0
StateName 0
..
2024-08-31 0
2024-09-30 0
2024-10-31 0
2024-11-30 0
2024-12-31 0
Length: 309, dtype: int64
# Drop columns with all missing values
df_cleaned = df.dropna(axis=1, how='all')
# Fill missing values with the mean of the column
#dd_cleaned = df_cleaned.fillna(data_cleaned.mean())
# Display the cleaned dataframe
print(df_cleaned.head(5))
RegionID SizeRank RegionName RegionType StateName State City \
0 91982 1 77494 zip TX TX Katy
1 61148 2 8701 zip NJ NJ Lakewood
2 91940 3 77449 zip TX TX Katy
3 62080 4 11368 zip NY NY New York
4 91733 5 77084 zip TX TX Houston
Metro CountyName 2000-01-31 \
0 Houston-The Woodlands-Sugar Land, TX Fort Bend County 209183.857849
1 New York-Newark-Jersey City, NY-NJ-PA Ocean County 154922.985332
2 Houston-The Woodlands-Sugar Land, TX Harris County 103712.508410
3 New York-Newark-Jersey City, NY-NJ-PA Queens County 302696.952652
4 Houston-The Woodlands-Sugar Land, TX Harris County 102666.820965
... 2024-03-31 2024-04-30 2024-05-31 2024-06-30 \
0 ... 490931.167974 493431.867652 494729.768844 495028.711253
1 ... 689106.574503 699661.801780 708559.971483 715094.582736
2 ... 280754.700629 281717.848068 282255.119080 282218.313457
3 ... 885946.861384 895038.513007 901396.636146 901967.282407
4 ... 274609.487817 275605.670852 276105.868652 275935.732220
2024-07-31 2024-08-31 2024-09-30 2024-10-31 2024-11-30 \
0 494651.061479 495058.369196 495747.641095 496870.232365 497591.216589
1 720443.771066 726000.038135 733103.210025 740083.000281 745160.491341
2 281986.353330 281807.105654 281620.894517 281206.739260 280422.308405
3 900606.279548 899885.353997 902459.710773 904617.265178 905818.780045
4 275641.755834 275374.731211 275130.328740 274631.886651 274039.496837
2024-12-31
0 498013.381149
1 747964.767711
2 279697.082664
3 905896.339783
4 273706.228700
[5 rows x 309 columns]
# Distribution of home values in a specific region over time
region_name = '68104'
region_data = df_cleaned[df_cleaned['RegionName'] == int(region_name)].iloc[:, 10:].T
region_data.columns = ['HomeValue']
region_data.index = pd.to_datetime(region_data.index)
plt.figure(figsize=(12, 6))
plt.plot(region_data.index, region_data['HomeValue'], marker='o',color='blue')
plt.title(f'Home Values Over Time in Region {region_name}')
plt.xlabel('Year')
plt.ylabel('Home Value')
plt.grid(True)
plt.show()
print('''The above chart shows the property value trends over time for zip code 68104.
### Summary of the Chart:
X-axis: Represents the time period from around 2000 to 2024.
Y-axis: Represents the property values.
### Key Observations:
Starting Value: Property values started at approximately $60,000 in 2000.
Trend: There is a general upward trend in property values over this period, with some fluctuations.
Significant Increases: Notable increases are observed starting around 2012, with a steep rise continuing through to 2024.
Ending Value: By 2024, property values have reached approximately $180,000.
### Interpretation of the Chart:
Substantial Growth: Property values in zip code 68104 have shown substantial growth over the past two decades.
Stable Period: From 2000 to around 2012, property values remained relatively stable.
Marked Increase: After 2012, there was a marked increase in property values, suggesting increasing demand or other economic factors positively influencing property prices in this area.
Economic Factors: The upward trend indicates that the area has become more desirable, possibly due to improvements in infrastructure, amenities, or overall economic growth.
''')
The above chart shows the property value trends over time for zip code 68104. ### Summary of the Chart: X-axis: Represents the time period from around 2000 to 2024. Y-axis: Represents the property values. ### Key Observations: Starting Value: Property values started at approximately $60,000 in 2000. Trend: There is a general upward trend in property values over this period, with some fluctuations. Significant Increases: Notable increases are observed starting around 2012, with a steep rise continuing through to 2024. Ending Value: By 2024, property values have reached approximately $180,000. ### Interpretation of the Chart: Substantial Growth: Property values in zip code 68104 have shown substantial growth over the past two decades. Stable Period: From 2000 to around 2012, property values remained relatively stable. Marked Increase: After 2012, there was a marked increase in property values, suggesting increasing demand or other economic factors positively influencing property prices in this area. Economic Factors: The upward trend indicates that the area has become more desirable, possibly due to improvements in infrastructure, amenities, or overall economic growth.
#Dataframe containing only data for Metro= 'Omaha-Council Bluffs, NE-IA' and state NE
Douglas_County_df = df_cleaned[(df_cleaned['Metro'] == 'Omaha-Council Bluffs, NE-IA') & (df['State'] =='NE')]
Douglas_County_df.head(5) ##74 rows
| RegionID | SizeRank | RegionName | RegionType | StateName | State | City | Metro | CountyName | 2000-01-31 | ... | 2024-03-31 | 2024-04-30 | 2024-05-31 | 2024-06-30 | 2024-07-31 | 2024-08-31 | 2024-09-30 | 2024-10-31 | 2024-11-30 | 2024-12-31 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2297 | 87855 | 2327 | 68104 | zip | NE | NE | Omaha | Omaha-Council Bluffs, NE-IA | Douglas County | 61845.391049 | ... | 185160.710986 | 186644.242877 | 187791.405578 | 188147.951130 | 188347.104765 | 187877.484359 | 187293.870262 | 186750.679296 | 186918.838632 | 188001.320669 |
| 2978 | 87866 | 3016 | 68116 | zip | NE | NE | Omaha | Omaha-Council Bluffs, NE-IA | Douglas County | 160146.962255 | ... | 360014.507509 | 361201.696645 | 362065.723010 | 362382.759677 | 362877.616611 | 363285.613712 | 363814.343178 | 364471.070866 | 365379.998067 | 366662.100117 |
| 3209 | 87811 | 3249 | 68022 | zip | NE | NE | Omaha | Omaha-Council Bluffs, NE-IA | Douglas County | 232227.952627 | ... | 489064.713115 | 491103.233097 | 492741.813566 | 493406.193632 | 493826.757872 | 494095.134680 | 494514.837025 | 495216.566507 | 495832.424226 | 496803.212295 |
| 3315 | 87880 | 3356 | 68134 | zip | NE | NE | Omaha | Omaha-Council Bluffs, NE-IA | Douglas County | 98567.912015 | ... | 238592.511319 | 239866.692413 | 240967.030857 | 241484.981002 | 241982.593922 | 241972.676199 | 241969.955464 | 241884.931339 | 242249.207926 | 243141.689693 |
| 3433 | 87830 | 3474 | 68046 | zip | NE | NE | Papillion | Omaha-Council Bluffs, NE-IA | Sarpy County | 170796.025558 | ... | 382245.904584 | 382833.611747 | 383008.730987 | 382491.955618 | 382419.505359 | 382979.577767 | 384197.039017 | 385515.919123 | 386691.873029 | 387832.049927 |
5 rows × 309 columns
Douglas_County_df.to_csv("C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\output_sb.csv")
#Zip Codes located within the Douglas County region
fig, ax = plt.subplots(figsize=(15,12))
y = [a for a in Douglas_County_df['City'].value_counts()]
x = [a for a in Douglas_County_df['City'].value_counts().keys()]
ax.barh(x,y,color='brown')
ax.set_title('Cities in Douglas County',fontsize=30)
ax.set_ylabel('Cities',fontsize=20)
ax.set_xlabel('Number of Zipcodes in the City',fontsize=20);
print('''
The chart titled Cities in Douglas County displays the number of zip codes in various cities within Douglas County. Here's a breakdown of the chart:
X- axis: Represents the "Number of Zipcodes in the City," ranging from 0 to 30.
Y-axis: Lists the names of different cities in Douglas County.
### Key Observations:
Omaha Has the highest number of zip codes, close to 30, indicating it is the largest city in terms of zip code distribution.
Bellevue Follows with around 3 zip codes.
Other Cities Cities like Valley, Waterloo, and Bennington have fewer than three zip codes each.
Smaller Cities Most other cities listed have only one or two zip codes.
'''
)
The chart titled Cities in Douglas County displays the number of zip codes in various cities within Douglas County. Here's a breakdown of the chart: X- axis: Represents the "Number of Zipcodes in the City," ranging from 0 to 30. Y-axis: Lists the names of different cities in Douglas County. ### Key Observations: Omaha Has the highest number of zip codes, close to 30, indicating it is the largest city in terms of zip code distribution. Bellevue Follows with around 3 zip codes. Other Cities Cities like Valley, Waterloo, and Bennington have fewer than three zip codes each. Smaller Cities Most other cities listed have only one or two zip codes.
#Filtering for Customers with budget of 300000 in Omaha-Council Bluffs, NE-IA Metro
Douglas_County_budget = Douglas_County_df[Douglas_County_df['2024-12-31']<=300000]
Douglas_County_budget.head(5)
| RegionID | SizeRank | RegionName | RegionType | StateName | State | City | Metro | CountyName | 2000-01-31 | ... | 2024-03-31 | 2024-04-30 | 2024-05-31 | 2024-06-30 | 2024-07-31 | 2024-08-31 | 2024-09-30 | 2024-10-31 | 2024-11-30 | 2024-12-31 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2297 | 87855 | 2327 | 68104 | zip | NE | NE | Omaha | Omaha-Council Bluffs, NE-IA | Douglas County | 61845.391049 | ... | 185160.710986 | 186644.242877 | 187791.405578 | 188147.951130 | 188347.104765 | 187877.484359 | 187293.870262 | 186750.679296 | 186918.838632 | 188001.320669 |
| 3315 | 87880 | 3356 | 68134 | zip | NE | NE | Omaha | Omaha-Council Bluffs, NE-IA | Douglas County | 98567.912015 | ... | 238592.511319 | 239866.692413 | 240967.030857 | 241484.981002 | 241982.593922 | 241972.676199 | 241969.955464 | 241884.931339 | 242249.207926 | 243141.689693 |
| 3637 | 87858 | 3680 | 68107 | zip | NE | NE | Omaha | Omaha-Council Bluffs, NE-IA | Douglas County | NaN | ... | 169186.248724 | 170364.472709 | 171447.742422 | 172266.922477 | 172682.737537 | 172473.963275 | 171863.443067 | 171391.584374 | 171530.134221 | 172501.351022 |
| 3756 | 87894 | 3801 | 68164 | zip | NE | NE | Omaha | Omaha-Council Bluffs, NE-IA | Douglas County | 121613.700108 | ... | 273720.719422 | 275016.216030 | 275950.743521 | 276211.076942 | 276273.706253 | 276165.584832 | 276466.455383 | 276778.757067 | 277302.349579 | 278065.714655 |
| 4733 | 87883 | 4795 | 68137 | zip | NE | NE | Omaha | Omaha-Council Bluffs, NE-IA | Douglas County | 115569.772369 | ... | 263510.808942 | 264846.039979 | 265950.025928 | 266242.996832 | 266468.031601 | 266588.847157 | 266886.168878 | 267178.123198 | 267585.412526 | 268627.929641 |
5 rows × 309 columns
#List of Zip Codes in theOmaha-Council Bluffs, NE-IA Metro within client's budget.
Douglas_County_budget_zips = [a for a in Douglas_County_budget['RegionName']]
#Fetching necessary data from the dataframe for the bar chart
#Getting City Names
city_names = []
for zipcode in Douglas_County_budget_zips:
city_names.append(Douglas_County_budget[Douglas_County_budget['RegionName']==zipcode].iloc[0]['City'])
#Retrieving number of zipcodes for each city and feeding it to count_dict
count_dict = {}
for a in Counter(city_names).keys():
count_dict[a] = Counter(city_names)[a]
#sorting dictionary into ascending order
count_dict = dict(sorted(count_dict.items(), key=lambda item: item[1]))
#Building the bar chart
fig,ax = plt.subplots(figsize=(21,12))
x_labels = [a for a in count_dict.keys()]
x = list(range(1,len(x_labels)+1))
y = [a for a in count_dict.values()]
ax.bar(x,y,color='rebeccapurple',width=0.5)
ax.set_xticks(x)
ax.set_xticklabels(x_labels,fontsize='12',rotation=90)
ax.set_title("Cities with zipcodes fitting our Client's budget",fontsize='30')
ax.set_ylabel("Number of Zipcodes",fontsize='20')
ax.set_xlabel("Cities in Douglas County",fontsize='20');
print('''The chart titled Cities with zipcodes fitting our Client's budget shows the number of zip codes in various cities within Douglas County that fit the client's budget.
X-axis: Represents different cities in Douglas County.
Y-axis: Represents the number of zip codes fitting the client's budget, ranging from 0 to 20, with increments of 2.5.
### Key Observations:
Omaha: Has the highest number of zip codes fitting the client's budget, with approximately 18 zip codes.
Bellevue: Has a moderate number of zip codes, around 5.
Other Cities: Cities like La Vista, Ralston, Millard, Bennington, Waterloo, Elkhorn, Valley, Cedar Bluffs, Gretna, Springfield, Papillion, and Boys Town have fewer zip codes, mostly around 1 or less.
This bar chart effectively illustrates that Omaha has the most zip codes that fit the client's budget, significantly more than any other city in Douglas County.
''')
The chart titled Cities with zipcodes fitting our Client's budget shows the number of zip codes in various cities within Douglas County that fit the client's budget. X-axis: Represents different cities in Douglas County. Y-axis: Represents the number of zip codes fitting the client's budget, ranging from 0 to 20, with increments of 2.5. ### Key Observations: Omaha: Has the highest number of zip codes fitting the client's budget, with approximately 18 zip codes. Bellevue: Has a moderate number of zip codes, around 5. Other Cities: Cities like La Vista, Ralston, Millard, Bennington, Waterloo, Elkhorn, Valley, Cedar Bluffs, Gretna, Springfield, Papillion, and Boys Town have fewer zip codes, mostly around 1 or less. This bar chart effectively illustrates that Omaha has the most zip codes that fit the client's budget, significantly more than any other city in Douglas County.
#Douglas_County_budget
melted_temp = pd.melt(Douglas_County_budget, id_vars=['RegionName', 'RegionID', 'SizeRank', 'City', 'State', 'Metro', 'CountyName'], var_name='time')
melted_temp.head(1000)
melted_zip = melted_temp[(melted_temp['Metro'] == 'Omaha-Council Bluffs, NE-IA') & (melted_temp['time'] != 'RegionType') & (melted_temp['time'] != 'StateName')]## & (melted_temp['RegionName'] == '68104')]
##melted_zip.to_csv("C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\output_sb.csv")
import pandas as pd
import matplotlib.pyplot as plt
#melted_zip = pd.read_csv('C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\output_sb.csv')
# Ensure 'melted_zip' is a copy to avoid SettingWithCopyWarning
melted_zip_time = melted_zip.copy()
# Convert 'time' column to datetime format
melted_zip_time.loc[:, 'time'] = pd.to_datetime(melted_zip_time['time'])
# Drop rows with missing values in 'value' column
melted_zip_time = melted_zip_time.dropna(subset=['value'])
# Convert 'value' column to numeric, forcing errors to NaN
melted_zip_time.loc[:, 'value'] = pd.to_numeric(melted_zip_time['value'], errors='coerce')
# Drop any remaining NaN values
df_cleaned_time = melted_zip_time.dropna(subset=['value'])
df_cleaned_time['year'] = df_cleaned_time['time'].dt.year
# Plot time series for each zip code
plt.figure(figsize=(12, 6))
for zip_code, data in df_cleaned_time.groupby('RegionName'):
plt.plot(data['time'], data['value'], label=zip_code, alpha=0.5)
plt.xlabel("Time", fontsize=12)
plt.ylabel("Value", fontsize=12)
plt.title("Time Series Plot of Property Values for Each Zip Code", fontsize=14)
plt.xticks(rotation=45, fontsize=10) # Rotate x-axis labels for better readability
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), title="Zip Code", ncol=2)
plt.grid(True)
plt.tight_layout() # Prevents label overlap
plt.show()
print('''
The time series chart titled Time Series Plot of Property Values for Each Zip Code shows the trends in property values over time for different zip codes.
X-axis: Represents the time in years, ranging from 2000 to 2024.
Y-axis: Represents the property values.
### Key Observations:
Multiple Lines: Each line on the chart corresponds to a different zip code, showing how property values have changed over time for each zip code.
Trends: Most lines show an increase in property values over the period, particularly after 2012. This indicates a general upward trend in property values for the zip codes included in the chart.
Legend: The legend on the right side lists the zip codes, helping to identify which line corresponds to which zip code.
This chart effectively illustrates the historical trends in property values for various zip codes, highlighting periods of growth and stability.
''')
The time series chart titled Time Series Plot of Property Values for Each Zip Code shows the trends in property values over time for different zip codes. X-axis: Represents the time in years, ranging from 2000 to 2024. Y-axis: Represents the property values. ### Key Observations: Multiple Lines: Each line on the chart corresponds to a different zip code, showing how property values have changed over time for each zip code. Trends: Most lines show an increase in property values over the period, particularly after 2012. This indicates a general upward trend in property values for the zip codes included in the chart. Legend: The legend on the right side lists the zip codes, helping to identify which line corresponds to which zip code. This chart effectively illustrates the historical trends in property values for various zip codes, highlighting periods of growth and stability.
##Distribution of Property Prices
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
plt.figure(figsize=(10, 5))
sns.histplot(df_cleaned_time['value'], bins=50, kde=True)
plt.xlabel("Property Value")
plt.ylabel("Frequency")
plt.title("Distribution of Property Prices")
plt.grid(True)
plt.show()
print('''The chart "Distribution of Property Prices" visualizes the distribution of property values.
### Summary of the Chart:
X-axis:: Represents property values, ranging from 0 to 300,000.
Y-axis: Represents the frequency of properties within each value range.
Histogram: Shows the distribution of property values with 50 bins.
KDE Line: A Kernel Density Estimate (KDE) line is overlaid to provide a smooth estimate of the probability density function for property values.
### Key Observations:
--> Most property values are concentrated between approximately 50,000 and 150,000.
-->There is a significant peak in frequency around property values between 75,000 and 100,000.
### Interpretation of the Chart:
-->Property Value Range: The majority of properties fall within the 50,000 to 150,000 range, suggesting that this is the most common price range for properties in the dataset.
-->High Demand Range: The peak around 75,000 to 100,000 indicates a high demand or a large number of properties valued within this range.
This chart effectively illustrates the distribution of property prices, highlighting the most common value ranges and providing a clear picture of the property market.
''')
The chart "Distribution of Property Prices" visualizes the distribution of property values. ### Summary of the Chart: X-axis:: Represents property values, ranging from 0 to 300,000. Y-axis: Represents the frequency of properties within each value range. Histogram: Shows the distribution of property values with 50 bins. KDE Line: A Kernel Density Estimate (KDE) line is overlaid to provide a smooth estimate of the probability density function for property values. ### Key Observations: --> Most property values are concentrated between approximately 50,000 and 150,000. -->There is a significant peak in frequency around property values between 75,000 and 100,000. ### Interpretation of the Chart: -->Property Value Range: The majority of properties fall within the 50,000 to 150,000 range, suggesting that this is the most common price range for properties in the dataset. -->High Demand Range: The peak around 75,000 to 100,000 indicates a high demand or a large number of properties valued within this range. This chart effectively illustrates the distribution of property prices, highlighting the most common value ranges and providing a clear picture of the property market.
df_cleaned_time.head(5)
| RegionName | RegionID | SizeRank | City | State | Metro | CountyName | time | value | year | |
|---|---|---|---|---|---|---|---|---|---|---|
| 70 | 68104 | 87855 | 2327 | Omaha | NE | Omaha-Council Bluffs, NE-IA | Douglas County | 2000-01-31 | 61845.391049 | 2000 |
| 71 | 68134 | 87880 | 3356 | Omaha | NE | Omaha-Council Bluffs, NE-IA | Douglas County | 2000-01-31 | 98567.912015 | 2000 |
| 73 | 68164 | 87894 | 3801 | Omaha | NE | Omaha-Council Bluffs, NE-IA | Douglas County | 2000-01-31 | 121613.700108 | 2000 |
| 74 | 68137 | 87883 | 4795 | Omaha | NE | Omaha-Council Bluffs, NE-IA | Douglas County | 2000-01-31 | 115569.772369 | 2000 |
| 75 | 68144 | 87887 | 4992 | Omaha | NE | Omaha-Council Bluffs, NE-IA | Douglas County | 2000-01-31 | 117433.426333 | 2000 |
##Boxplot of Property Values by Year
plt.figure(figsize=(12, 6))
sns.boxplot(x='year', y='value', data=df_cleaned_time)
plt.xticks(rotation=90)
plt.xlabel("Year")
plt.ylabel("Property Value")
plt.title("Property Value Distribution Over the Years")
plt.grid(True)
plt.show()
print('''The chart is a boxplot of property values by year from 2000 to 2024.
### Summary of the Chart:
X-axis: Represents the years from 2000 to 2024.
Y-axis: Represents property values.
-->Boxplot: Each boxplot corresponds to a specific year and shows the distribution of property values for that year.
-->Boxes: Represent the interquartile range (IQR), with the line inside each box indicating the median value.
-->Whiskers: Extend from each box to show variability outside the upper and lower quartiles.
-->Outliers: Plotted as individual points beyond the whiskers, indicating property values significantly higher or lower than the majority.
### Interpretation of the Chart:
-->Increasing Property Values: Property values have generally increased over time from 2000 to 2024.
-->Variability: There is noticeable variability in property values each year, with some years showing wider IQRs than others.
-->Rising Median Values: Median property values have risen consistently over this period, indicating an overall upward trend in property prices.
-->Outliers: Present in most years, suggesting that there are some properties with significantly higher or lower values compared to the majority.
This boxplot effectively illustrates the distribution and trends of property values over time, highlighting the
overall increase in property prices and the variability within each year.
'''
)
The chart is a boxplot of property values by year from 2000 to 2024. ### Summary of the Chart: X-axis: Represents the years from 2000 to 2024. Y-axis: Represents property values. -->Boxplot: Each boxplot corresponds to a specific year and shows the distribution of property values for that year. -->Boxes: Represent the interquartile range (IQR), with the line inside each box indicating the median value. -->Whiskers: Extend from each box to show variability outside the upper and lower quartiles. -->Outliers: Plotted as individual points beyond the whiskers, indicating property values significantly higher or lower than the majority. ### Interpretation of the Chart: -->Increasing Property Values: Property values have generally increased over time from 2000 to 2024. -->Variability: There is noticeable variability in property values each year, with some years showing wider IQRs than others. -->Rising Median Values: Median property values have risen consistently over this period, indicating an overall upward trend in property prices. -->Outliers: Present in most years, suggesting that there are some properties with significantly higher or lower values compared to the majority. This boxplot effectively illustrates the distribution and trends of property values over time, highlighting the overall increase in property prices and the variability within each year.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.cluster import KMeans
import statsmodels.api as sm
# Load the data from the CSV file
data = df #pd.read_csv('C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\Zip_zhvi_uc_sfr_tier_0.33_0.67_sm_sa_month.csv')
# Display the first few rows of the dataframe
print(data.head(30))
RegionID SizeRank RegionName RegionType StateName State City \
0 91982 1 77494 zip TX TX Katy
1 61148 2 8701 zip NJ NJ Lakewood
2 91940 3 77449 zip TX TX Katy
3 62080 4 11368 zip NY NY New York
4 91733 5 77084 zip TX TX Houston
5 93144 6 79936 zip TX TX El Paso
6 62093 7 11385 zip NY NY New York
7 92593 8 78660 zip TX TX Pflugerville
8 62019 9 11208 zip NY NY New York
9 95992 10 90011 zip CA CA Los Angeles
10 91926 11 77433 zip TX TX Cypress
11 84630 12 60629 zip IL IL Chicago
12 96193 13 90650 zip CA CA Norwalk
13 96361 14 91331 zip CA CA Los Angeles
14 74101 15 37013 zip TN TN Nashville
15 62046 16 11236 zip NY NY New York
16 61807 17 10467 zip NY NY New York
17 62085 18 11373 zip NY NY New York
18 62037 19 11226 zip NY NY New York
19 62018 20 11207 zip NY NY New York
20 70829 21 30044 zip GA GA Lawrenceville
21 96025 22 90044 zip CA CA Los Angeles
22 96817 23 92336 zip CA CA Fontana
23 97771 24 94565 zip CA CA Pittsburg
24 92036 25 77573 zip TX TX League City
25 96816 26 92335 zip CA CA Fontana
26 96083 27 90201 zip CA CA Bell
27 96107 28 90250 zip CA CA Hawthorne
28 61639 29 10025 zip NY NY New York
29 90654 30 75052 zip TX TX Grand Prairie
Metro CountyName \
0 Houston-The Woodlands-Sugar Land, TX Fort Bend County
1 New York-Newark-Jersey City, NY-NJ-PA Ocean County
2 Houston-The Woodlands-Sugar Land, TX Harris County
3 New York-Newark-Jersey City, NY-NJ-PA Queens County
4 Houston-The Woodlands-Sugar Land, TX Harris County
5 El Paso, TX El Paso County
6 New York-Newark-Jersey City, NY-NJ-PA Queens County
7 Austin-Round Rock-Georgetown, TX Travis County
8 New York-Newark-Jersey City, NY-NJ-PA Kings County
9 Los Angeles-Long Beach-Anaheim, CA Los Angeles County
10 Houston-The Woodlands-Sugar Land, TX Harris County
11 Chicago-Naperville-Elgin, IL-IN-WI Cook County
12 Los Angeles-Long Beach-Anaheim, CA Los Angeles County
13 Los Angeles-Long Beach-Anaheim, CA Los Angeles County
14 Nashville-Davidson--Murfreesboro--Franklin, TN Davidson County
15 New York-Newark-Jersey City, NY-NJ-PA Kings County
16 New York-Newark-Jersey City, NY-NJ-PA Bronx County
17 New York-Newark-Jersey City, NY-NJ-PA Queens County
18 New York-Newark-Jersey City, NY-NJ-PA Kings County
19 New York-Newark-Jersey City, NY-NJ-PA Kings County
20 Atlanta-Sandy Springs-Alpharetta, GA Gwinnett County
21 Los Angeles-Long Beach-Anaheim, CA Los Angeles County
22 Riverside-San Bernardino-Ontario, CA San Bernardino County
23 San Francisco-Oakland-Berkeley, CA Contra Costa County
24 Houston-The Woodlands-Sugar Land, TX Galveston County
25 Riverside-San Bernardino-Ontario, CA San Bernardino County
26 Los Angeles-Long Beach-Anaheim, CA Los Angeles County
27 Los Angeles-Long Beach-Anaheim, CA Los Angeles County
28 New York-Newark-Jersey City, NY-NJ-PA New York County
29 Dallas-Fort Worth-Arlington, TX Dallas County
2000-01-31 ... 2024-03-31 2024-04-30 2024-05-31 \
0 209183.857849 ... 4.909312e+05 4.934319e+05 4.947298e+05
1 154922.985332 ... 6.891066e+05 6.996618e+05 7.085600e+05
2 103712.508410 ... 2.807547e+05 2.817178e+05 2.822551e+05
3 302696.952652 ... 8.859469e+05 8.950385e+05 9.013966e+05
4 102666.820965 ... 2.746095e+05 2.756057e+05 2.761059e+05
5 89296.318317 ... 2.087716e+05 2.101550e+05 2.112621e+05
6 275372.206968 ... 8.165990e+05 8.218378e+05 8.249659e+05
7 156630.893790 ... 4.038166e+05 4.036711e+05 4.032774e+05
8 187881.614494 ... 6.795292e+05 6.835815e+05 6.864174e+05
9 112751.607515 ... 5.682337e+05 5.665444e+05 5.674512e+05
10 188219.104775 ... 4.303793e+05 4.320427e+05 4.327555e+05
11 94129.380468 ... 2.357525e+05 2.389012e+05 2.415132e+05
12 157854.835028 ... 7.201091e+05 7.218172e+05 7.254325e+05
13 125683.830348 ... 7.063197e+05 7.056936e+05 7.072474e+05
14 123561.218927 ... 3.669590e+05 3.681828e+05 3.693311e+05
15 216374.206832 ... 6.705887e+05 6.733294e+05 6.745830e+05
16 165851.804503 ... 6.091809e+05 6.158641e+05 6.201697e+05
17 280787.914972 ... 9.187083e+05 9.252334e+05 9.309053e+05
18 263959.505980 ... 1.106530e+06 1.116589e+06 1.126436e+06
19 173128.169383 ... 6.416483e+05 6.440602e+05 6.449091e+05
20 135548.780295 ... 3.618449e+05 3.641860e+05 3.660326e+05
21 127861.032846 ... 6.156394e+05 6.142426e+05 6.153391e+05
22 162735.847249 ... 6.842914e+05 6.883012e+05 6.927089e+05
23 128210.120549 ... 5.834464e+05 5.872290e+05 5.907028e+05
24 NaN ... 3.795797e+05 3.805545e+05 3.813277e+05
25 100734.402730 ... 5.405824e+05 5.420519e+05 5.453330e+05
26 143328.878914 ... 6.540787e+05 6.541549e+05 6.553819e+05
27 192046.275566 ... 8.390104e+05 8.374998e+05 8.397481e+05
28 NaN ... 1.375445e+06 1.366103e+06 1.357637e+06
29 106423.904221 ... 3.303254e+05 3.307599e+05 3.308807e+05
2024-06-30 2024-07-31 2024-08-31 2024-09-30 2024-10-31 \
0 4.950287e+05 4.946511e+05 4.950584e+05 4.957476e+05 4.968702e+05
1 7.150946e+05 7.204438e+05 7.260000e+05 7.331032e+05 7.400830e+05
2 2.822183e+05 2.819864e+05 2.818071e+05 2.816209e+05 2.812067e+05
3 9.019673e+05 9.006063e+05 8.998854e+05 9.024597e+05 9.046173e+05
4 2.759357e+05 2.756418e+05 2.753747e+05 2.751303e+05 2.746319e+05
5 2.117728e+05 2.119963e+05 2.121595e+05 2.124250e+05 2.126144e+05
6 8.241853e+05 8.235255e+05 8.248570e+05 8.286172e+05 8.320040e+05
7 4.022068e+05 4.008842e+05 3.997884e+05 3.992148e+05 3.982709e+05
8 6.865275e+05 6.868643e+05 6.877204e+05 6.895644e+05 6.899895e+05
9 5.683689e+05 5.709211e+05 5.749465e+05 5.808213e+05 5.862152e+05
10 4.323321e+05 4.313540e+05 4.309533e+05 4.309909e+05 4.310984e+05
11 2.439708e+05 2.455966e+05 2.473541e+05 2.485644e+05 2.496615e+05
12 7.283206e+05 7.323845e+05 7.374218e+05 7.433161e+05 7.478709e+05
13 7.096395e+05 7.141819e+05 7.209321e+05 7.282843e+05 7.344391e+05
14 3.699239e+05 3.704216e+05 3.703718e+05 3.702954e+05 3.699462e+05
15 6.736895e+05 6.732175e+05 6.744504e+05 6.778572e+05 6.812262e+05
16 6.206369e+05 6.191099e+05 6.170219e+05 6.152134e+05 6.139342e+05
17 9.315913e+05 9.313808e+05 9.311822e+05 9.320977e+05 9.347486e+05
18 1.131265e+06 1.132676e+06 1.133803e+06 1.136250e+06 1.137471e+06
19 6.440674e+05 6.433077e+05 6.434067e+05 6.445494e+05 6.454747e+05
20 3.671151e+05 3.679698e+05 3.686291e+05 3.688152e+05 3.683133e+05
21 6.166183e+05 6.195873e+05 6.240806e+05 6.301036e+05 6.351262e+05
22 6.969221e+05 7.004450e+05 7.027734e+05 7.049159e+05 7.065207e+05
23 5.918456e+05 5.915789e+05 5.906498e+05 5.890761e+05 5.868436e+05
24 3.817400e+05 3.820621e+05 3.823446e+05 3.827447e+05 3.829185e+05
25 5.483909e+05 5.513960e+05 5.534808e+05 5.559602e+05 5.578559e+05
26 6.557622e+05 6.578933e+05 6.622184e+05 6.686316e+05 6.737822e+05
27 8.419164e+05 8.458017e+05 8.503824e+05 8.566929e+05 8.622520e+05
28 1.341787e+06 1.344609e+06 1.359405e+06 1.372631e+06 1.370689e+06
29 3.306506e+05 3.304376e+05 3.302504e+05 3.301694e+05 3.298518e+05
2024-11-30 2024-12-31
0 4.975912e+05 4.980134e+05
1 7.451605e+05 7.479648e+05
2 2.804223e+05 2.796971e+05
3 9.058188e+05 9.058963e+05
4 2.740395e+05 2.737062e+05
5 2.127732e+05 2.129577e+05
6 8.348912e+05 8.369265e+05
7 3.970073e+05 3.950774e+05
8 6.896089e+05 6.882254e+05
9 5.909170e+05 5.945645e+05
10 4.307925e+05 4.304035e+05
11 2.507528e+05 2.518538e+05
12 7.510449e+05 7.533441e+05
13 7.395935e+05 7.448252e+05
14 3.697126e+05 3.695855e+05
15 6.846801e+05 6.865968e+05
16 6.110946e+05 6.065055e+05
17 9.388682e+05 9.445741e+05
18 1.137110e+06 1.138285e+06
19 6.457403e+05 6.442788e+05
20 3.671993e+05 3.661526e+05
21 6.391128e+05 6.418200e+05
22 7.084834e+05 7.096680e+05
23 5.851863e+05 5.845002e+05
24 3.827001e+05 3.823055e+05
25 5.602846e+05 5.622288e+05
26 6.768174e+05 6.785491e+05
27 8.682955e+05 8.738884e+05
28 1.365597e+06 1.362692e+06
29 3.292662e+05 3.284714e+05
[30 rows x 309 columns]
# Bar Chart
#plt.figure(figsize=(10, 6))
#data['RegionName'].value_counts().plot(kind='bar')
#plt.title('Distribution of Region Names')
#plt.xlabel('Region Name')
#plt.ylabel('Count')
#plt.show()
# QQ Plot
plt.figure(figsize=(10, 6))
sm.qqplot(data['2024-01-31'], line ='45')
plt.title('QQ Plot of Home Values on 01/31/2024')
plt.show()
<Figure size 720x432 with 0 Axes>
print('''
Summary of the Chart:
The QQ plot (Quantile-Quantile plot) compares the distribution of home values on 01/31/2003 to a theoretical normal distribution.
Points on the plot represent the quantiles of the data against the quantiles of the normal distribution.
Interpretation of the Chart:
If the data follows a normal distribution, the points will lie along the 45-degree reference line.
Deviations from this line indicate departures from normality.
In this case, the QQ plot helps us assess whether the home values on 01/31/2004 are normally distributed.
Significant deviations from the line suggest that the data may not be normally distributed.
''')
Summary of the Chart: The QQ plot (Quantile-Quantile plot) compares the distribution of home values on 01/31/2003 to a theoretical normal distribution. Points on the plot represent the quantiles of the data against the quantiles of the normal distribution. Interpretation of the Chart: If the data follows a normal distribution, the points will lie along the 45-degree reference line. Deviations from this line indicate departures from normality. In this case, the QQ plot helps us assess whether the home values on 01/31/2004 are normally distributed. Significant deviations from the line suggest that the data may not be normally distributed.
# Pair Plot (Scatter plot and Density Plots)
sns.pairplot(data[['2024-10-31', '2024-11-30', '2024-12-31']])
plt.show()
print('''
##Summary of the Chart:
-->The pair plot includes scatter plots for each pair of variables and density plots along the diagonal.
]The variables are dates: `2024-10-31`, `2024-11-30`, and `2024-12-31`.
##Interpretation of the Chart:
1.Diagonal Elements:
--> The diagonal elements are density plots showing the distribution of values for each date.
--> For `2024-10-31`, `2024-11-30`, and `2024-12-31`, there is a clear peak indicating that most values are concentrated around a specific range.
2. Off-Diagonal Elements:
--> The off-diagonal elements are scatter plots showing pairwise relationships between dates.
--> There is a strong positive linear relationship between all pairs (`2024-10-31` vs. `2024-11-30`, `2024-10-31` vs. `2024-12-31`, and `2024-11-30` vs. `2024-12-31`).
This is indicated by points forming an upward-sloping line in each scatter plot.
There is a strong correlation between the data points on these three different dates, suggesting that as one variable increases,
so do the others in a linear fashion.
This can be useful for understanding trends and patterns in the data over time.
''')
##Summary of the Chart: -->The pair plot includes scatter plots for each pair of variables and density plots along the diagonal. ]The variables are dates: `2024-10-31`, `2024-11-30`, and `2024-12-31`. ##Interpretation of the Chart: 1.Diagonal Elements: --> The diagonal elements are density plots showing the distribution of values for each date. --> For `2024-10-31`, `2024-11-30`, and `2024-12-31`, there is a clear peak indicating that most values are concentrated around a specific range. 2. Off-Diagonal Elements: --> The off-diagonal elements are scatter plots showing pairwise relationships between dates. --> There is a strong positive linear relationship between all pairs (`2024-10-31` vs. `2024-11-30`, `2024-10-31` vs. `2024-12-31`, and `2024-11-30` vs. `2024-12-31`). This is indicated by points forming an upward-sloping line in each scatter plot. There is a strong correlation between the data points on these three different dates, suggesting that as one variable increases, so do the others in a linear fashion. This can be useful for understanding trends and patterns in the data over time.
# KMeans Clustering
kmeans = KMeans(n_clusters=3)
data = data.dropna(subset=['2024-10-31', '2024-11-30', '2024-12-31'])
data['Cluster'] = kmeans.fit_predict(data[['2024-10-31', '2024-11-30', '2024-12-31']])
# Scatter plot with clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x='2024-10-31' ,y ='2024-11-30', hue='Cluster', data=data, palette='viridis')
plt.title('KMeans Clustering')
plt.xlabel('Home Values on 2024-10-31')
plt.ylabel('Home Values on 2024-11-30')
plt.show()
print('''
--> The chart is a scatter plot showing the results of KMeans clustering on home values from two different dates: 2024-10-31 and 2024-11-30.
The data points are color-coded based on the cluster they belong to, with three clusters in total (Cluster 0, Cluster 1, and Cluster 2).
The x-axis represents home values on 2024-10-31,
while the y-axis represents home values on 2024-11-30.
-->Cluster 0 (yellow): This cluster has a dense concentration along the diagonal line,
indicating that for these homes, the values remained relatively stable between the two dates.
-->Cluster 1 (purple): This cluster also follows a similar pattern but is more spread out compared to Cluster 0.
-->Cluster 2 (green): This cluster shows a wider distribution with some outliers indicating significant changes
in home values between the two dates.
-->Trend Analysis:
- The diagonal trend suggests that most homes did not experience drastic changes in value over this one-month period.
- Outliers in each cluster indicate homes that had significant increases or decreases in value.
there is a strong correlation between the home values on these two dates, with most homes showing stable values.
The clustering helps identify groups of homes with similar value trends, providing insights into regional trends
and market segmentation.
''')
--> The chart is a scatter plot showing the results of KMeans clustering on home values from two different dates: 2024-10-31 and 2024-11-30.
The data points are color-coded based on the cluster they belong to, with three clusters in total (Cluster 0, Cluster 1, and Cluster 2).
The x-axis represents home values on 2024-10-31,
while the y-axis represents home values on 2024-11-30.
-->Cluster 0 (yellow): This cluster has a dense concentration along the diagonal line,
indicating that for these homes, the values remained relatively stable between the two dates.
-->Cluster 1 (purple): This cluster also follows a similar pattern but is more spread out compared to Cluster 0.
-->Cluster 2 (green): This cluster shows a wider distribution with some outliers indicating significant changes
in home values between the two dates.
-->Trend Analysis:
- The diagonal trend suggests that most homes did not experience drastic changes in value over this one-month period.
- Outliers in each cluster indicate homes that had significant increases or decreases in value.
there is a strong correlation between the home values on these two dates, with most homes showing stable values.
The clustering helps identify groups of homes with similar value trends, providing insights into regional trends
and market segmentation.
import os
import pandas as pd
#Function to read all CSV files from a folder and write to a dataframe
def read_csv_files_from_folder(folder_path, encoding='utf-8'):
# List to store individual dataframes
df_list = []
# Iterate over all files in the folder
for file_name in os.listdir(folder_path):
# Check if the file is a CSV file
if file_name.endswith('.csv'):
# Read the CSV file and append the dataframe to the list
file_path = os.path.join(folder_path, file_name)
try:
df = pd.read_csv(file_path, encoding=encoding)
df_list.append(df)
except UnicodeDecodeError:
print(f"Encoding error in file: {file_name}")
# Concatenate all dataframes into a single dataframe
combined_df = pd.concat(df_list, ignore_index=True)
return combined_df
#Specify the folder path containing the CSV files
folder_path = 'C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\Files used\\CAINC1'
#Read data from all CSV files in the folder and write to a dataframe
combined_df = read_csv_files_from_folder(folder_path, encoding='latin1')
combined_df.to_csv("C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\output_sb_income.csv")
combined_df['GeoFIPS'] = combined_df['GeoFIPS'].astype(str).str.replace('"', '')
#Display the first few rows of the combined dataframe
print(combined_df.head())
GeoFIPS GeoName Region TableName LineCode \
0 02000 Alaska 8 CAINC1 1.0
1 02000 Alaska 8 CAINC1 2.0
2 02000 Alaska 8 CAINC1 3.0
3 02010 Aleutian Islands Census Area, AK* 8 CAINC1 1.0
4 02010 Aleutian Islands Census Area, AK* 8 CAINC1 2.0
IndustryClassification Description \
0 ... Personal income (thousands of dollars)
1 ... Population (persons) 1/
2 ... Per capita personal income (dollars) 2/
3 ... Personal income (thousands of dollars)
4 ... Population (persons) 1/
Unit 1969 1970 ... 2014 2015 2016 \
0 Thousands of dollars 1575306 1798953 ... 40642115 41918639 41285498
1 Number of persons 296000 304328 ... 737638 739127 743410
2 Dollars 5322 5911 ... 55098 56714 55535
3 Thousands of dollars (NA) (NA) ... (NA) (NA) (NA)
4 Number of persons (NA) (NA) ... (NA) (NA) (NA)
2017 2018 2019 2020 2021 2022 2023
0 41919375 43522749 44459670 45339636 48183676 50606751 52519998
1 741949 737717 734823 732964 734923 733276 733406
2 56499 58997 60504 61858 65563 69015 71611
3 (NA) (NA) (NA) (NA) (NA) (NA) (NA)
4 (NA) (NA) (NA) (NA) (NA) (NA) (NA)
[5 rows x 63 columns]
combined_df_Personal_income = combined_df[combined_df['LineCode'] == 1.0]
combined_df_Per_capita_Personal_income = combined_df[combined_df['LineCode'] == 3.0]
combined_df_Population = combined_df[combined_df['LineCode'] == 2.0]
combined_df_Personal_income.head()
| GeoFIPS | GeoName | Region | TableName | LineCode | IndustryClassification | Description | Unit | 1969 | 1970 | ... | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 | 2023 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 02000 | Alaska | 8 | CAINC1 | 1.0 | ... | Personal income (thousands of dollars) | Thousands of dollars | 1575306 | 1798953 | ... | 40642115 | 41918639 | 41285498 | 41919375 | 43522749 | 44459670 | 45339636 | 48183676 | 50606751 | 52519998 |
| 3 | 02010 | Aleutian Islands Census Area, AK* | 8 | CAINC1 | 1.0 | ... | Personal income (thousands of dollars) | Thousands of dollars | (NA) | (NA) | ... | (NA) | (NA) | (NA) | (NA) | (NA) | (NA) | (NA) | (NA) | (NA) | (NA) |
| 6 | 02013 | Aleutians East Borough, AK* | 8 | CAINC1 | 1.0 | ... | Personal income (thousands of dollars) | Thousands of dollars | (NA) | (NA) | ... | 133578 | 165850 | 191326 | 200602 | 192045 | 202667 | 195030 | 202501 | 217797 | 226422 |
| 9 | 02016 | Aleutians West Census Area, AK* | 8 | CAINC1 | 1.0 | ... | Personal income (thousands of dollars) | Thousands of dollars | (NA) | (NA) | ... | 299213 | 297307 | 306233 | 294135 | 305957 | 315261 | 326510 | 329054 | 350185 | 372727 |
| 12 | 02020 | Anchorage Municipality, AK | 8 | CAINC1 | 1.0 | ... | Personal income (thousands of dollars) | Thousands of dollars | 772088 | 875085 | ... | 18194339 | 18812630 | 18446135 | 18629102 | 19242734 | 19383968 | 19745555 | 20786653 | 21871379 | 22652344 |
5 rows × 63 columns
##pip install addfips
import pandas as pd
import addfips
#Initialize the AddFIPS object
af = addfips.AddFIPS()
#Function to get the FIPS code based on county and state
def get_fips(row):
return af.get_county_fips(row['CountyName'], row['State'])
#Add the GeoFIPS column to the df_zip DataFrame
df['GeoFIPS'] = df.apply(get_fips, axis=1)
df.head(5)
| RegionID | SizeRank | RegionName | RegionType | StateName | State | City | Metro | CountyName | 2000-01-31 | ... | 2024-04-30 | 2024-05-31 | 2024-06-30 | 2024-07-31 | 2024-08-31 | 2024-09-30 | 2024-10-31 | 2024-11-30 | 2024-12-31 | GeoFIPS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 91982 | 1 | 77494 | zip | TX | TX | Katy | Houston-The Woodlands-Sugar Land, TX | Fort Bend County | 209183.857849 | ... | 493431.867652 | 494729.768844 | 495028.711253 | 494651.061479 | 495058.369196 | 495747.641095 | 496870.232365 | 497591.216589 | 498013.381149 | 48157 |
| 1 | 61148 | 2 | 8701 | zip | NJ | NJ | Lakewood | New York-Newark-Jersey City, NY-NJ-PA | Ocean County | 154922.985332 | ... | 699661.801780 | 708559.971483 | 715094.582736 | 720443.771066 | 726000.038135 | 733103.210025 | 740083.000281 | 745160.491341 | 747964.767711 | 34029 |
| 2 | 91940 | 3 | 77449 | zip | TX | TX | Katy | Houston-The Woodlands-Sugar Land, TX | Harris County | 103712.508410 | ... | 281717.848068 | 282255.119080 | 282218.313457 | 281986.353330 | 281807.105654 | 281620.894517 | 281206.739260 | 280422.308405 | 279697.082664 | 48201 |
| 3 | 62080 | 4 | 11368 | zip | NY | NY | New York | New York-Newark-Jersey City, NY-NJ-PA | Queens County | 302696.952652 | ... | 895038.513007 | 901396.636146 | 901967.282407 | 900606.279548 | 899885.353997 | 902459.710773 | 904617.265178 | 905818.780045 | 905896.339783 | 36081 |
| 4 | 91733 | 5 | 77084 | zip | TX | TX | Houston | Houston-The Woodlands-Sugar Land, TX | Harris County | 102666.820965 | ... | 275605.670852 | 276105.868652 | 275935.732220 | 275641.755834 | 275374.731211 | 275130.328740 | 274631.886651 | 274039.496837 | 273706.228700 | 48201 |
5 rows × 310 columns
combined_df_Personal_income.to_csv("C:\\Users\\req98376\\Downloads\\pi.csv")
df.to_csv("C:\\Users\\req98376\\Downloads\\df.csv")
#Join the two DataFrames on the GeoFIPS column
#combined_df = pd.merge(combined_df_Personal_income, df, on='GeoFIPS', how='inner')
#Display the first few rows of the combined DataFrame
#print(combined_df.head())
##pip install folium
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import folium
# Load the data from the CSV files
zip_lat_long_data = pd.read_csv('C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\Files used\\USZipsWithLatLon_20231227.csv') # File with zip codes, latitude, and longitude
zip_data = df #pd.read_csv('C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\Files used\\Zip_zhvi_uc_sfr_tier_0.33_0.67_sm_sa_month.csv') # File with zip codes
# Merge the data based on zip codes
merged_data = pd.merge(zip_data, zip_lat_long_data, left_on='RegionName', right_on='postal code')
#merged_data.head(2)
# Draw the Map
sanfran_map=folium.Map(location=[41.2403, -95.7129], zoom_start=4)
incidents = folium.map.FeatureGroup()
for lat, lng, in zip(merged_data.latitude ,merged_data.longitude):
incidents.add_child(
folium.features.CircleMarker(
[lat, lng],
radius=5,
color='yellow',
fill=True,
fill_color='blue',
fill_opacity=0.6,
)
)
# add incidents to map
sanfran_map.add_child(incidents)